/*
 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
 * policies)
 */
#include "sched.h"

/*
 * Part of the period that we allow rt tasks to run, in us.
 * default: 950000us = 0.95s out of each period.
 */
int sysctl_sched_rt_runtime = 950000;

/*
 * Period over which we measure -rt task cpu usage, in us.
 * default: 1000000us = 1s.
 */
unsigned int sysctl_sched_rt_period = 1000000;

/* Round-robin timeslice, in ticks (see task_tick_rt()). */
int sysctl_sched_rr_timeslice = RR_TIMESLICE;

/* Forward declaration: replenishment work done from the period timer. */
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);

/* Single, global RT bandwidth pool (no per-group bandwidth here). */
struct rt_bandwidth def_rt_bandwidth;

/*
 * hrtimer callback for the RT bandwidth period timer.
 *
 * Forward the timer one period at a time, replenishing runtime for
 * every period we have passed, until the timer expiry lies in the
 * future again.  Keep the timer running for as long as the last
 * replenishment pass reported pending RT work.
 */
static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
{
    struct rt_bandwidth *rt_b =
        container_of(timer, struct rt_bandwidth, rt_period_timer);
    int overruns;
    int idle = 0;

    while ((overruns = hrtimer_forward(timer, hrtimer_cb_get_time(timer),
                       rt_b->rt_period)) != 0)
        idle = do_sched_rt_period_timer(rt_b, overruns);

    return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}

/*
 * Initialize an RT bandwidth pool.
 *
 * @rt_b:    the bandwidth pool to set up
 * @period:  replenishment period, in ns
 * @runtime: runtime budget per period, in ns (RUNTIME_INF = unlimited)
 */
void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
{
    raw_spin_lock_init(&rt_b->rt_runtime_lock);

    rt_b->rt_period = ns_to_ktime(period);
    rt_b->rt_runtime = runtime;

    /* Relative, monotonic timer; armed lazily by start_rt_bandwidth(). */
    hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
    rt_b->rt_period_timer.function = sched_rt_period_timer;
}

/*
 * Arm the periodic replenishment timer for @rt_b, unless bandwidth
 * control is disabled or the runtime budget is unlimited (in which
 * case throttling can never trigger and no timer is needed).
 */
static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
    if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
        return;

    /*
     * NOTE(review): this check runs outside rt_runtime_lock, so a
     * concurrent caller could slip past it before the timer below is
     * armed — presumably benign if start_bandwidth_timer() tolerates
     * an already-active timer; confirm against its implementation.
     */
    if (hrtimer_active(&rt_b->rt_period_timer))
        return;

    raw_spin_lock(&rt_b->rt_runtime_lock);
    start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
    raw_spin_unlock(&rt_b->rt_runtime_lock);
}

/*
 * Initialize the RT runqueue embedded in runqueue @rq.
 *
 * Empties every priority list, clears the priority bitmap, and zeroes
 * the bandwidth accounting state.  (@rq itself is not touched here.)
 */
void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
{
    struct rt_prio_array *array = &rt_rq->active;
    int prio;

    for (prio = 0; prio < MAX_RT_PRIO; prio++) {
        INIT_LIST_HEAD(array->queue + prio);
        __clear_bit(prio, array->bitmap);
    }
    /* Sentinel bit so the priority bitsearch always terminates. */
    __set_bit(MAX_RT_PRIO, array->bitmap);

    rt_rq->rt_time = 0;
    rt_rq->rt_runtime = 0;
    rt_rq->rt_throttled = 0;
    raw_spin_lock_init(&rt_rq->rt_runtime_lock);
}

/* Without group scheduling, every RT entity is backed by a task. */
#define rt_entity_is_task(rt_se) (1)

/* Map an RT scheduling entity back to its owning task. */
static inline struct tcb *rt_task_of(struct sched_rt_entity *rt_se)
{
    return container_of(rt_se, struct tcb, rt);
}

/* The runqueue this rt_rq is embedded in. */
static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{
    return container_of(rt_rq, struct rq, rt);
}

/* The rt_rq an entity is (or would be) queued on: its task's runqueue. */
static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
{
    struct tcb *p = rt_task_of(rt_se);
    struct rq *rq = task_rq(p);

    return &rq->rt;
}

/*
 * An entity is queued iff its run_list is non-empty; dequeue uses
 * list_del_init() (see __dequeue_rt_entity()) to keep this invariant.
 */
static inline int on_rt_rq(struct sched_rt_entity *rt_se)
{
    return !list_empty(&rt_se->run_list);
}

/* Runtime budget of this rt_rq for one period, in ns. */
static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
{
    return rt_rq->rt_runtime;
}

/* Length of the (global) RT bandwidth period, in ns. */
static inline u64 sched_rt_period(struct rt_rq *rt_rq)
{
    return ktime_to_ns(def_rt_bandwidth.rt_period);
}

/* Without group scheduling there is exactly one rt_rq per runqueue. */
typedef struct rt_rq *rt_rq_iter_t;

/* Visits the single rt_rq of @rq exactly once; @iter is unused. */
#define for_each_rt_rq(rt_rq, iter, rq) \
    for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)

/* A task's entity "hierarchy" is just itself: body runs exactly once. */
#define for_each_sched_rt_entity(rt_se) \
    for (; rt_se; rt_se = NULL)

/* No task groups: an entity never represents a child rt_rq. */
static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
{
    return NULL;
}

/*
 * "Unthrottle" this rt_rq: with no group scheduling there is nothing
 * to re-enqueue, so just kick the current task off the CPU when RT
 * work is pending.
 */
static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
    if (!rt_rq->rt_nr_running)
        return;

    resched_task(rq_of_rt_rq(rt_rq)->curr);
}

/* Throttle hook: nothing to physically dequeue without task groups. */
static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
}

/* Is this rt_rq currently throttled (runtime budget exhausted)? */
static inline int rt_rq_throttled(struct rt_rq *rt_rq)
{
    return rt_rq->rt_throttled;
}

/* CPUs whose rt_rqs the period timer must replenish: all online ones. */
static inline const struct cpumask *sched_rt_period_mask(void)
{
    return cpu_online_mask;
}

/* The rt_rq on @cpu served by bandwidth pool @rt_b (unused here). */
static inline
struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
{
    return &cpu_rq(cpu)->rt;
}

/* Every rt_rq draws from the single global bandwidth pool. */
static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
{
    return &def_rt_bandwidth;
}

/*
 * Should RT runtime consumed on this rt_rq be accounted?
 *
 * True while the replenishment timer is running, or while the queue
 * is still within its runtime budget for the current period.
 */
bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
{
    struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);

    return (hrtimer_active(&rt_b->rt_period_timer) ||
        rt_rq->rt_time < rt_b->rt_runtime);
}

/*
 * Replenish RT runtime on every relevant CPU after @overrun elapsed
 * periods, unthrottling queues whose consumed time drops back under
 * budget.
 *
 * Returns 1 ("idle") when no rt_rq has pending time or runnable tasks
 * left — the caller then lets the period timer die; 0 otherwise.
 *
 * Lock order: rq->lock, then rt_rq->rt_runtime_lock.
 */
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
{
    int i, idle = 1, throttled = 0;
    const struct cpumask *span;

    span = sched_rt_period_mask();
    for_each_cpu(i, span) {
        int enqueue = 0;
        struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
        struct rq *rq = rq_of_rt_rq(rt_rq);

        spin_lock(&rq->lock);
        if (rt_rq->rt_time) {
            u64 runtime;

            raw_spin_lock(&rt_rq->rt_runtime_lock);
            runtime = rt_rq->rt_runtime;
            /* Pay back one period's budget per overrun, floored at 0. */
            rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
            if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
                rt_rq->rt_throttled = 0;
                enqueue = 1;

                /*
                 * Force a clock update if the CPU was idle,
                 * lest wakeup -> unthrottle time accumulate.
                 */
                if (rt_rq->rt_nr_running && rq->curr == rq->idle)
                    rq->skip_clock_update = -1;
            }
            if (rt_rq->rt_time || rt_rq->rt_nr_running)
                idle = 0;
            raw_spin_unlock(&rt_rq->rt_runtime_lock);
        } else if (rt_rq->rt_nr_running) {
            /* Runnable tasks but no debt: keep the timer alive. */
            idle = 0;
            if (!rt_rq_throttled(rt_rq))
                enqueue = 1;
        }
        if (rt_rq->rt_throttled)
            throttled = 1;

        if (enqueue)
            sched_rt_rq_enqueue(rt_rq);
        spin_unlock(&rq->lock);
    }

    /* No throttling possible at all: let the timer stop immediately. */
    if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
        return 1;

    return idle;
}

/* Effective priority of an RT entity: its owning task's prio. */
static inline int rt_se_prio(struct sched_rt_entity *rt_se)
{
    return rt_task_of(rt_se)->prio;
}

/*
 * Check whether this rt_rq has consumed more than its runtime budget
 * for the current period, and throttle it if so.
 *
 * Called from update_curr_rt() with rt_rq->rt_runtime_lock held.
 *
 * Returns 1 when the queue is (now) throttled and the current task
 * should be rescheduled, 0 otherwise.
 */
static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
{
    u64 runtime = sched_rt_runtime(rt_rq);

    /* Already throttled: nothing changes until replenishment. */
    if (rt_rq->rt_throttled)
        return rt_rq_throttled(rt_rq);

    /* A budget covering the whole period can never be exceeded. */
    if (runtime >= sched_rt_period(rt_rq))
        return 0;

    /*
     * Note: the redundant re-read of rt_runtime that used to sit here
     * (a leftover from the SMP runtime-balancing code) was dropped —
     * the value cannot change while we hold rt_runtime_lock.
     */
    if (runtime == RUNTIME_INF)
        return 0;

    if (rt_rq->rt_time > runtime) {
        struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);

        /*
         * Don't actually throttle groups that have no runtime assigned
         * but accrue some time due to boosting.
         */
        if (likely(rt_b->rt_runtime)) {
            static bool once = false;

            rt_rq->rt_throttled = 1;

            /* Warn once, ever, so throttling is visible in the log. */
            if (!once) {
                once = true;
                pr_warn("sched: RT throttling activated\n");
            }
        } else {
            /*
             * In case we did anyway, make it go away,
             * replenishment is a joke, since it will replenish us
             * with exactly 0 ns.
             */
            rt_rq->rt_time = 0;
        }

        if (rt_rq_throttled(rt_rq)) {
            sched_rt_rq_dequeue(rt_rq);
            return 1;
        }
    }

    return 0;
}

/*
 * Update the current task's runtime statistics. Skip current tasks that
 * are not in our scheduling class.
 *
 * Charges the elapsed execution time both to the task (sum_exec_runtime)
 * and, when bandwidth control is enabled, to its rt_rq's rt_time, which
 * may trigger throttling and a reschedule.
 */
static void update_curr_rt(struct rq *rq)
{
    struct tcb *curr = rq->curr;
    struct sched_rt_entity *rt_se = &curr->rt;
    struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
    u64 delta_exec;

    if (curr->sched_class != &rt_sched_class)
        return;

    /* Clock may have gone backwards or not advanced: nothing to charge. */
    delta_exec = rq_clock_task(rq) - curr->se.exec_start;
    if (unlikely((s64)delta_exec <= 0))
        return;

    curr->se.sum_exec_runtime += delta_exec;

    /* Restart the measurement window from "now". */
    curr->se.exec_start = rq_clock_task(rq);

    if (!rt_bandwidth_enabled())
        return;

    /* Single iteration here: no entity hierarchy without groups. */
    for_each_sched_rt_entity(rt_se) {
        rt_rq = rt_rq_of_se(rt_se);

        if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
            raw_spin_lock(&rt_rq->rt_runtime_lock);
            rt_rq->rt_time += delta_exec;
            if (sched_rt_runtime_exceeded(rt_rq))
                resched_task(curr);
            raw_spin_unlock(&rt_rq->rt_runtime_lock);
        }
    }
}

/*
 * Group-accounting hook on enqueue: without real groups all it does
 * is make sure the global bandwidth timer is running.
 */
static void
inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
    start_rt_bandwidth(&def_rt_bandwidth);
}

/* Group-accounting hook on dequeue: nothing to undo here. */
static inline
void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}

/* Account a newly enqueued RT entity on @rt_rq. */
static inline
void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
    int prio = rt_se_prio(rt_se);

    WARN_ON(!rt_prio(prio));
    rt_rq->rt_nr_running++;

    inc_rt_group(rt_se, rt_rq);
}

/* Account the removal of an RT entity from @rt_rq. */
static inline
void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
    WARN_ON(!rt_prio(rt_se_prio(rt_se)));
    WARN_ON(!rt_rq->rt_nr_running);
    rt_rq->rt_nr_running--;

    dec_rt_group(rt_se, rt_rq);
}

/*
 * Link @rt_se onto its priority list (head or tail) and set the
 * corresponding bit in the priority bitmap.
 */
static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
{
    struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
    struct rt_prio_array *array = &rt_rq->active;
    struct rt_rq *group_rq = group_rt_rq(rt_se);
    struct list_head *queue = array->queue + rt_se_prio(rt_se);

    /*
     * Don't enqueue the group if it's throttled, or when empty.
     * The latter is a consequence of the former when a child group
     * gets throttled and the current group doesn't have any other
     * active members.
     * (With group scheduling compiled out, group_rq is always NULL
     * and this branch is dead.)
     */
    if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
        return;

    if (head)
        list_add(&rt_se->run_list, queue);
    else
        list_add_tail(&rt_se->run_list, queue);
    __set_bit(rt_se_prio(rt_se), array->bitmap);

    inc_rt_tasks(rt_se, rt_rq);
}

/*
 * Unlink @rt_se from its priority list; clear the priority bit when
 * its list becomes empty, and drop the accounting.
 */
static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
{
    struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
    int prio = rt_se_prio(rt_se);

    /* list_del_init keeps on_rt_rq() returning false afterwards. */
    list_del_init(&rt_se->run_list);
    if (list_empty(rt_rq->active.queue + prio))
        __clear_bit(prio, rt_rq->active.bitmap);

    dec_rt_tasks(rt_se, rt_rq);
}

/*
 * Because the prio of an upper entry depends on the lower
 * entries, we must remove entries top - down.
 *
 * First pass builds a reversed chain via rt_se->back, second pass
 * dequeues along that chain.  Without groups the chain has a single
 * element, so this reduces to "dequeue rt_se if queued".
 */
static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
{
    struct sched_rt_entity *back = NULL;

    for_each_sched_rt_entity(rt_se) {
        rt_se->back = back;
        back = rt_se;
    }

    for (rt_se = back; rt_se; rt_se = rt_se->back) {
        if (on_rt_rq(rt_se))
            __dequeue_rt_entity(rt_se);
    }
}

/*
 * (Re-)enqueue @rt_se: remove any stale queueing first, then insert
 * at the head or tail of its priority list.
 */
static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
{
    dequeue_rt_stack(rt_se);
    for_each_sched_rt_entity(rt_se)
        __enqueue_rt_entity(rt_se, head);
}

/*
 * Dequeue @rt_se.  The re-enqueue loop only matters with group
 * scheduling (keeping ancestors queued while they still have other
 * runnable children); here group_rt_rq() is always NULL, so the loop
 * body never enqueues anything.
 */
static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
{
    dequeue_rt_stack(rt_se);

    for_each_sched_rt_entity(rt_se) {
        struct rt_rq *rt_rq = group_rt_rq(rt_se);

        if (rt_rq && rt_rq->rt_nr_running)
            __enqueue_rt_entity(rt_se, false);
    }
}

/*
 * Adding/removing a task to/from a priority array:
 */
/*
 * Scheduling-class hook: add @p to the RT priority array.
 * ENQUEUE_HEAD puts it at the front of its priority list.
 */
static void
enqueue_task_rt(struct rq *rq, struct tcb *p, int flags)
{
    enqueue_rt_entity(&p->rt, flags & ENQUEUE_HEAD);
}

/*
 * Scheduling-class hook: remove @p from the RT priority array,
 * charging its runtime consumed so far first.
 */
static void dequeue_task_rt(struct rq *rq, struct tcb *p, int flags)
{
    update_curr_rt(rq);
    dequeue_rt_entity(&p->rt);
}

/*
 * Put task to the head or the end of the run list without the overhead
 * of dequeue followed by enqueue.  No-op when the entity is not queued.
 */
static void
requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
{
    struct list_head *queue;

    if (!on_rt_rq(rt_se))
        return;

    queue = rt_rq->active.queue + rt_se_prio(rt_se);
    if (head)
        list_move(&rt_se->run_list, queue);
    else
        list_move_tail(&rt_se->run_list, queue);
}

/* Requeue @p within its priority list (head when @head is nonzero). */
static void requeue_task_rt(struct rq *rq, struct tcb *p, int head)
{
    struct sched_rt_entity *rt_se = &p->rt;
    struct rt_rq *rt_rq;

    for_each_sched_rt_entity(rt_se) {
        rt_rq = rt_rq_of_se(rt_se);
        requeue_rt_entity(rt_rq, rt_se, head);
    }
}

/* sched_yield(): move current to the tail of its priority list. */
static void yield_task_rt(struct rq *rq)
{
    requeue_task_rt(rq, rq->curr, 0);
}

/*
 * Preempt the current task with a newly woken task if needed:
 * a strictly higher (numerically lower) priority wins the CPU.
 */
static void check_preempt_curr_rt(struct rq *rq, struct tcb *p, int flags)
{
    if (p->prio < rq->curr->prio)
        resched_task(rq->curr);
}

/*
 * Pick the first entity on the highest-priority non-empty list.
 * The sentinel bit set in init_rt_rq() guarantees the bitsearch
 * terminates; callers must ensure the array is non-empty, otherwise
 * the BUG_ON fires.
 */
static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
                           struct rt_rq *rt_rq)
{
    struct rt_prio_array *array = &rt_rq->active;
    struct sched_rt_entity *next = NULL;
    struct list_head *queue;
    int idx;

    idx = sched_find_first_bit(array->bitmap);
    BUG_ON(idx >= MAX_RT_PRIO);

    queue = array->queue + idx;
    next = list_entry(queue->next, struct sched_rt_entity, run_list);

    return next;
}

/*
 * Select the next RT task to run on @rq, or NULL when there is none
 * (empty or throttled rt_rq).  Stamps the task's exec_start so the
 * next update_curr_rt() measures from here.
 */
static struct tcb *_pick_next_task_rt(struct rq *rq)
{
    struct sched_rt_entity *rt_se;
    struct tcb *p;
    struct rt_rq *rt_rq;

    rt_rq = &rq->rt;

    if (!rt_rq->rt_nr_running)
        return NULL;

    if (rt_rq_throttled(rt_rq))
        return NULL;

    /* Descend the group hierarchy; a single step without groups. */
    do {
        rt_se = pick_next_rt_entity(rq, rt_rq);
        BUG_ON(!rt_se);
        rt_rq = group_rt_rq(rt_se);
    } while (rt_rq);

    p = rt_task_of(rt_se);
    p->se.exec_start = rq_clock_task(rq);

    return p;
}

/* Scheduling-class hook: thin wrapper around _pick_next_task_rt(). */
static struct tcb *pick_next_task_rt(struct rq *rq)
{
    return _pick_next_task_rt(rq);
}

/* Charge the outgoing task's runtime before it leaves the CPU. */
static void put_prev_task_rt(struct rq *rq, struct tcb *p)
{
    update_curr_rt(rq);
}

/*
 * A task switched into the RT class (e.g. via setscheduler).  On this
 * uniprocessor scheduler there are no other runqueues to push tasks
 * to; all we can do is preempt the current task when the switched
 * task is runnable and has higher priority.
 */
static void switched_to_rt(struct rq *rq, struct tcb *p)
{
    /*
     * If we are already running, then there's nothing
     * that needs to be done. But if we are not running
     * we may need to preempt the current running task.
     */
    if (p->on_rq && rq->curr != p) {
        if (p->prio < rq->curr->prio)
            resched_task(rq->curr);
    }
}

/*
 * Priority of the task has changed. Reschedule whoever should now
 * lose the CPU: the task itself if it is running and dropped its
 * priority, or the current task if @p now outranks it.
 */
static void
prio_changed_rt(struct rq *rq, struct tcb *p, int oldprio)
{
    if (!p->on_rq)
        return;

    if (rq->curr == p) {
        /* For UP simply resched on drop of prio */
        if (oldprio < p->prio)
            resched_task(p);
    } else {
        /*
         * This task is not running, but if it is
         * greater than the current running task
         * then reschedule.
         */
        if (p->prio < rq->curr->prio)
            resched_task(rq->curr);
    }
}

/*
 * Scheduler-tick hook for RT tasks: account runtime and handle
 * round-robin timeslice expiry.
 */
static void task_tick_rt(struct rq *rq, struct tcb *p, int queued)
{
    struct sched_rt_entity *rt_se = &p->rt;

    update_curr_rt(rq);

    /*
     * RR tasks need a special form of timeslice management.
     * FIFO tasks have no timeslices.
     */
    if (p->policy != SEMINIX_SCHED_RR)
        return;

    /* Slice not yet exhausted: nothing more to do this tick. */
    if (--p->rt.time_slice)
        return;

    p->rt.time_slice = sysctl_sched_rr_timeslice;

    /*
     * Requeue to the end of queue if we (and all of our ancestors) are not
     * the only element on the queue
     */
    for_each_sched_rt_entity(rt_se) {
        if (rt_se->run_list.prev != rt_se->run_list.next) {
            requeue_task_rt(rq, p, 0);
            set_tsk_need_resched(p);
            return;
        }
    }
}

/*
 * Current task just (re)entered the RT class as the running task:
 * restart its runtime measurement window.
 */
static void set_curr_task_rt(struct rq *rq)
{
    struct tcb *p = rq->curr;

    p->se.exec_start = rq_clock_task(rq);
}

/*
 * sched_rr_get_interval(): report the RR timeslice for RR tasks.
 * Time slice is 0 for SCHED_FIFO tasks.
 */
static unsigned int get_rr_interval_rt(struct rq *rq, struct tcb *task)
{
    return task->policy == SEMINIX_SCHED_RR ? sysctl_sched_rr_timeslice : 0;
}

/*
 * The RT scheduling class: handles SCHED_FIFO and SCHED_RR tasks,
 * consulted before the fair class (see .next).
 */
const struct sched_class rt_sched_class = {
    .next               = &fair_sched_class,

    .enqueue_task       = enqueue_task_rt,
    .dequeue_task       = dequeue_task_rt,

    .yield_task	        = yield_task_rt,

    .check_preempt_curr = check_preempt_curr_rt,

    .pick_next_task     = pick_next_task_rt,
    .put_prev_task      = put_prev_task_rt,

    .set_curr_task      = set_curr_task_rt,
    .task_tick          = task_tick_rt,

    .prio_changed       = prio_changed_rt,
    .switched_to        = switched_to_rt,

    .get_rr_interval    = get_rr_interval_rt,
};
